getwd()
## [1] "/Users/tiffa.chua/Desktop/HS614/HS614_Data-Analysis"

HS614: Homework #3

For this HW use the diabetes dataset.

Exploratory Data Analysis: 1. Explore the data 2. How many rows and columns? 3. Any NAs? If yes, which columns? 4. Any strange values? If yes, what to do? 5. Any correlation between features? 6. Statistics of the columns and some exploratory plots (bar plot, histogram, boxplot …)

Build at least 4 different predictive models that can predict (classify) Diabetic patients. Compare the performance of the models.

Column definitions of this dataset are given in table 1 of this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8306487/


library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.0, built: 2021-05-26)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(caTools)
library(caret)
## Loading required package: lattice
library(class)
library(e1071)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## corrplot 0.90 loaded
library(ISLR)
library(cluster)
library(datasets)
library(pastecs)
library(psych)
library(corrplot)
library(stats)
library(Hmisc)
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(mice)
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble  3.1.3     ✓ purrr   0.3.4
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x tidyr::extract()   masks pastecs::extract()
## x dplyr::filter()    masks mice::filter(), stats::filter()
## x dplyr::first()     masks pastecs::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks pastecs::last()
## x purrr::lift()      masks caret::lift()
## x dplyr::src()       masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:colorspace':
## 
##     coords
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)
## Warning: package 'rpart' was built under R version 4.1.2
library(rpart.plot)
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.2
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(animation)
diab <- read.csv("./diabetes.csv")
head(diab)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
str(diab)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

Variable Definitions

  1. Pregnancy - Frequency of pregnancy
  2. Glucose - Concentration of plasma glucose (mg/dL)
  3. BP - Diastolic blood pressure (mmHg)
  4. Skin - Tricep skinfold thickness (mm)
  5. Insulin - Two-hour serum insulin (mu U/ml)
  6. BMI - Body mass index (kg/m2)
  7. Pedigree - A pedigree function for diabetes
  8. Age - Age (log(years))
diab$Outcome = factor(diab$Outcome)
str(diab)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 2 ...
table(diab$Outcome)
## 
##   0   1 
## 500 268

Rows & Columns

nrow(diab)
## [1] 768
ncol(diab)
## [1] 9

Missing & Strange Variables

sum(is.na(diab))
## [1] 0
missmap(diab)

#Note: While there are no NA variables, missing data points seem to have been replaced with a value of 0 in columns

Note: While there are no NA variables, missing data points seem to have been replaced with a value of 0 in columns.

sum(diab$Glucose == 0)
## [1] 5
sum(diab$BloodPressure == 0)
## [1] 35
sum(diab$SkinThickness == 0)
## [1] 227
sum(diab$Insulin == 0)
## [1] 374
sum(diab$BMI == 0)
## [1] 11
sum(diab$DiabetesPedigreeFunction == 0)
## [1] 0
sum(diab$Age == 0)
## [1] 0
#Strange Values:
#The following columns returned values of 0 Glucose, BloodPressure, SkinThickness, Insulin, and BMI; It is likely that missing/NA variables for these columns were replaced with 0, as 0 is a highly unlikely and unrealistic observation for these features

Strange Values The following columns returned values of 0: Glucose, BloodPressure, SkinThickness, Insulin, and BMI. It is likely that missing/NA variables for these columns were replaced with 0, as 0 is a highly unlikely and unrealistic observation for these features.

#What to do with the strange values?
diab1 <- diab

#Turn strange (0) values into NA
diab1$Glucose[diab1$Glucose == 0] = NA
diab1$BloodPressure[diab1$BloodPressure == 0] = NA
diab1$SkinThickness[diab1$SkinThickness == 0] = NA
diab1$Insulin[diab1$Insulin == 0] = NA
diab1$BMI[diab1$BMI == 0] = NA

sum(is.na(diab1))
## [1] 652
missmap(diab1)

#I will use mean imputation on the NA variables in the Predictive Models section (after Train/Test Split)

What to do with the strange variables? 1. Turn strange (0) values into NA. 2. I will use mean imputation on the NA variables in the Predicive Models section (after Train/Test Split).


Exploratory Data Analysis

Initial Read

Descriptive Statistics

stat.desc(diab1)
##               Pregnancies      Glucose BloodPressure SkinThickness      Insulin
## nbr.val       768.0000000 7.630000e+02  7.330000e+02  5.410000e+02 3.940000e+02
## nbr.null      111.0000000 0.000000e+00  0.000000e+00  0.000000e+00 0.000000e+00
## nbr.na          0.0000000 5.000000e+00  3.500000e+01  2.270000e+02 3.740000e+02
## min             0.0000000 4.400000e+01  2.400000e+01  7.000000e+00 1.400000e+01
## max            17.0000000 1.990000e+02  1.220000e+02  9.900000e+01 8.460000e+02
## range          17.0000000 1.550000e+02  9.800000e+01  9.200000e+01 8.320000e+02
## sum          2953.0000000 9.284700e+04  5.307300e+04  1.577200e+04 6.128600e+04
## median          3.0000000 1.170000e+02  7.200000e+01  2.900000e+01 1.250000e+02
## mean            3.8450521 1.216868e+02  7.240518e+01  2.915342e+01 1.555482e+02
## SE.mean         0.1215892 1.105464e+00  4.573454e-01  4.504407e-01 5.983841e+00
## CI.mean.0.95    0.2386871 2.170117e+00  8.978652e-01  8.848307e-01 1.176434e+01
## var            11.3540563 9.324254e+02  1.533178e+02  1.097672e+02 1.410770e+04
## std.dev         3.3695781 3.053564e+01  1.238216e+01  1.047698e+01 1.187759e+02
## coef.var        0.8763413 2.509364e-01  1.710120e-01  3.593740e-01 7.635951e-01
##                       BMI DiabetesPedigreeFunction          Age Outcome
## nbr.val      7.570000e+02             768.00000000 7.680000e+02      NA
## nbr.null     0.000000e+00               0.00000000 0.000000e+00      NA
## nbr.na       1.100000e+01               0.00000000 0.000000e+00      NA
## min          1.820000e+01               0.07800000 2.100000e+01      NA
## max          6.710000e+01               2.42000000 8.100000e+01      NA
## range        4.890000e+01               2.34200000 6.000000e+01      NA
## sum          2.457030e+04             362.40100000 2.552900e+04      NA
## median       3.230000e+01               0.37250000 2.900000e+01      NA
## mean         3.245746e+01               0.47187630 3.324089e+01      NA
## SE.mean      2.516930e-01               0.01195579 4.243608e-01      NA
## CI.mean.0.95 4.941002e-01               0.02346996 8.330464e-01      NA
## var          4.795546e+01               0.10977864 1.383030e+02      NA
## std.dev      6.924988e+00               0.33132860 1.176023e+01      NA
## coef.var     2.133558e-01               0.70215138 3.537882e-01      NA
summary(diab1)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     Insulin            BMI        DiabetesPedigreeFunction      Age       
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780           Min.   :21.00  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437           1st Qu.:24.00  
##  Median :125.00   Median :32.30   Median :0.3725           Median :29.00  
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##  NA's   :374      NA's   :11                                              
##  Outcome
##  0:500  
##  1:268  
##         
##         
##         
##         
## 

Correlations

diab_num <- diab1[1:8]

M <- cor(diab_num,
         method = "pearson")
M
##                          Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies               1.00000000      NA            NA            NA
## Glucose                           NA       1            NA            NA
## BloodPressure                     NA      NA             1            NA
## SkinThickness                     NA      NA            NA             1
## Insulin                           NA      NA            NA            NA
## BMI                               NA      NA            NA            NA
## DiabetesPedigreeFunction -0.03352267      NA            NA            NA
## Age                       0.54434123      NA            NA            NA
##                          Insulin BMI DiabetesPedigreeFunction        Age
## Pregnancies                   NA  NA              -0.03352267 0.54434123
## Glucose                       NA  NA                       NA         NA
## BloodPressure                 NA  NA                       NA         NA
## SkinThickness                 NA  NA                       NA         NA
## Insulin                        1  NA                       NA         NA
## BMI                           NA   1                       NA         NA
## DiabetesPedigreeFunction      NA  NA               1.00000000 0.03356131
## Age                           NA  NA               0.03356131 1.00000000
corrplot(M,
         method = "square")

cor.plot(diab_num)

#Highest Correlating Variables:
# BMI & SkinThickness = 0.65
# Insulin & Glucose = 0.58
# Pregnancies & Age = 0.54
# BloodPressure & Age = 0.33
# BMI & BloodPressure = 0.29

Highest Correlating Variables: * BMI & SkinThickness = 0.65 * Insulin & Glucose = 0.58 * Pregnancies & Age = 0.54 * BloodPressure & Age = 0.33 * BMI & BloodPressure = 0.29

Plots

ggpairs(diab1)
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 40 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 232 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 375 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 5 rows containing missing values
## Warning: Removed 5 rows containing non-finite values (stat_boxplot).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 40 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 229 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 39 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 35 rows containing missing values
## Warning: Removed 35 rows containing non-finite values (stat_boxplot).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 232 rows containing missing values (geom_point).
## Warning: Removed 229 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 229 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 227 rows containing missing values
## Warning: Removed 227 rows containing non-finite values (stat_boxplot).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 375 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).

## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 375 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 374 rows containing missing values
## Warning: Removed 374 rows containing non-finite values (stat_boxplot).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 39 rows containing missing values (geom_point).
## Warning: Removed 229 rows containing missing values (geom_point).
## Warning: Removed 375 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 11 rows containing missing values
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 35 rows containing missing values (geom_point).
## Warning: Removed 227 rows containing missing values (geom_point).
## Warning: Removed 374 rows containing missing values (geom_point).
## Warning: Removed 11 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 227 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 374 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Outcome Barplot

ggplot(diab1,
       aes(Outcome)) +
  geom_bar()

Histograms, grouped by Outcome

# Grouped
# p <- data %>%
#   ggplot( aes(x=value, fill=type)) +
#     geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
#     scale_fill_manual(values=c("#69b3a2", "#404080")) +
#     theme_ipsum() +
#     labs(fill="")

Glucose_hist <- diab1 %>%
  ggplot(aes(x = Glucose,
             fill = Outcome)) +
  geom_histogram(binwidth = 5)
Glucose_hist
## Warning: Removed 5 rows containing non-finite values (stat_bin).

BP_hist <- diab1 %>%
  ggplot(aes(x = BloodPressure,
             fill = Outcome)) +
  geom_histogram(binwidth = 5)
BP_hist
## Warning: Removed 35 rows containing non-finite values (stat_bin).

BMI_hist <- diab1 %>%
  ggplot(aes(x = BMI,
             fill = Outcome)) +
  geom_histogram(binwidth = 2)
BMI_hist
## Warning: Removed 11 rows containing non-finite values (stat_bin).

Ped_hist <- diab1 %>%
  ggplot(aes(x = DiabetesPedigreeFunction,
             fill = Outcome)) +
  geom_histogram(binwidth = 0.075)
Ped_hist

Age_hist <- diab1 %>%
  ggplot(aes(x = Age,
             fill = Outcome)) +
  geom_histogram(binwidth = 2)
Age_hist

Scatter Plots, grouped by Outcome

#Highest Correlating Variables:
# BMI & SkinThickness = 0.65
# Insulin & Glucose = 0.58
# Pregnancies & Age = 0.54
# BloodPressure & Age = 0.33
# BMI & BloodPressure = 0.29

# BMI & SkinThickness = 0.65
BMI_Skin <- ggplot(data = diab1,
                   aes(x = BMI,
                       y = SkinThickness,
                       color = Outcome),
                   na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "BMI vs. Skin Thickness")
BMI_Skin
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 229 rows containing non-finite values (stat_smooth).
## Warning: Removed 229 rows containing missing values (geom_point).

# Insulin & Glucose = 0.58
Insu_Glucose <- ggplot(data = diab1,
                       aes(x = Insulin,
                           y = Glucose,
                           color = Outcome),
                       na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Insulin vs. Glucose")
Insu_Glucose
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 375 rows containing non-finite values (stat_smooth).
## Warning: Removed 375 rows containing missing values (geom_point).

# Pregnancies & Age = 0.54
Age_Preg <- ggplot(data = diab1,
                   aes(x = Age,
                       y = Pregnancies,
                       color = Outcome),
                   na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Pregnancies")
Age_Preg
## `geom_smooth()` using formula 'y ~ x'

# BloodPressure & Age = 0.33
Age_BP <- ggplot(data = diab1,
                    aes(x = Age,
                        y = BloodPressure,
                        color = Outcome),
                    na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Blood Pressure")
Age_BP
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 35 rows containing non-finite values (stat_smooth).
## Warning: Removed 35 rows containing missing values (geom_point).

# Glucose & Age = 0.32
Age_Glucose <- ggplot(data = diab1,
                      aes(x = Age,
                          y = Glucose,
                          color = Outcome),
                      na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "Age vs. Glucose")
Age_Glucose
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).

# BloodPressure & BMI = 0.29
BMI_BP <- ggplot(data = diab1,
                 aes(x = BMI,
                     y = BloodPressure,
                     color = Outcome),
                 na.rm = TRUE) +
  geom_point(size = 0.5) +
  geom_smooth(method = lm) +
  labs(title = "BMI vs. Blood Pressure")
BMI_BP
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 39 rows containing non-finite values (stat_smooth).
## Warning: Removed 39 rows containing missing values (geom_point).


Predictive Models

Train & Test Split

set.seed(123)
split <- sample.split(diab1$Outcome,
                      SplitRatio = 0.8)
diab.train <- subset(diab1,
                     split == TRUE)
diab.test <- subset(subset(diab1,
                           split == FALSE))

nrow(diab.test)
## [1] 154
nrow(diab.train)
## [1] 614
head(diab.train)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35      NA 33.6
## 2           1      85            66            29      NA 26.6
## 3           8     183            64            NA      NA 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 8          10     115            NA            NA      NA 35.3
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 8                    0.134  29       0
str(diab.train)
## 'data.frame':    614 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 10 8 4 10 10 ...
##  $ Glucose                 : int  148 85 183 89 137 115 125 110 168 139 ...
##  $ BloodPressure           : int  72 66 64 66 40 NA 96 92 74 80 ...
##  $ SkinThickness           : int  35 29 NA 23 35 NA NA NA NA NA ...
##  $ Insulin                 : int  NA NA NA 94 168 NA NA NA NA NA ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 35.3 NA 37.6 38 27.1 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 29 54 30 34 57 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
nrow(diab.train)
## [1] 614
ncol(diab.train)
## [1] 9
sum(is.na(diab.train))
## [1] 536
sum(is.na(diab.test))
## [1] 116
diab.train$Glucose[is.na(diab.train$Glucose)] <- mean(diab.train$Glucose,
                                          na.rm = TRUE)
diab.train$BloodPressure[is.na(diab.train$BloodPressure)] <- mean(diab.train$BloodPressure,
                                                      na.rm = TRUE)
diab.train$SkinThickness[is.na(diab.train$SkinThickness)] <- mean(diab.train$SkinThickness,
                                                      na.rm = TRUE)
diab.train$Insulin[is.na(diab.train$Insulin)] <- mean(diab.train$Insulin,
                                          na.rm = TRUE)
diab.train$BMI[is.na(diab.train$BMI)] <- mean(diab.train$BMI,
                                  na.rm = TRUE)

sum(is.na(diab.train))
## [1] 0
str(diab.train)
## 'data.frame':    614 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 10 8 4 10 10 ...
##  $ Glucose                 : num  148 85 183 89 137 115 125 110 168 139 ...
##  $ BloodPressure           : num  72 66 64 66 40 ...
##  $ SkinThickness           : num  35 29 29.2 23 35 ...
##  $ Insulin                 : num  154 154 154 94 168 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 29 54 30 34 57 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
diab.test$Glucose[is.na(diab.test$Glucose)] <- mean(diab.train$Glucose,
                                          na.rm = TRUE)
diab.test$BloodPressure[is.na(diab.test$BloodPressure)] <- mean(diab.train$BloodPressure,
                                                      na.rm = TRUE)
diab.test$SkinThickness[is.na(diab.test$SkinThickness)] <- mean(diab.train$SkinThickness,
                                                      na.rm = TRUE)
diab.test$Insulin[is.na(diab.test$Insulin)] <- mean(diab.train$Insulin,
                                          na.rm = TRUE)
diab.test$BMI[is.na(diab.test$BMI)] <- mean(diab.train$BMI,
                                  na.rm = TRUE)

sum(is.na(diab.test))
## [1] 0
head(diab.test)
##    Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 6            5     116            74      29.17647 154.3137 25.6
## 7            3      78            50      32.00000  88.0000 31.0
## 9            2     197            70      45.00000 543.0000 30.5
## 14           1     189            60      23.00000 846.0000 30.1
## 17           0     118            84      47.00000 230.0000 45.8
## 22           8      99            84      29.17647 154.3137 35.4
##    DiabetesPedigreeFunction Age Outcome
## 6                     0.201  30       0
## 7                     0.248  26       1
## 9                     0.158  53       1
## 14                    0.398  59       1
## 17                    0.551  31       1
## 22                    0.388  50       0

Logistic Regression

Logistic Regression Models

logModel1 <- glm(formula = Outcome ~ . , 
                 family = binomial,
                 data = diab.train)
summary(logModel1)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = diab.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5108  -0.7124  -0.3915   0.6682   2.1827  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.799113   0.892047  -9.864  < 2e-16 ***
## Pregnancies               0.135168   0.037465   3.608 0.000309 ***
## Glucose                   0.040167   0.004477   8.972  < 2e-16 ***
## BloodPressure            -0.015105   0.009399  -1.607 0.108045    
## SkinThickness             0.005799   0.015364   0.377 0.705834    
## Insulin                  -0.002570   0.001424  -1.805 0.071029 .  
## BMI                       0.096264   0.020274   4.748 2.05e-06 ***
## DiabetesPedigreeFunction  0.559306   0.337462   1.657 0.097441 .  
## Age                       0.014412   0.010702   1.347 0.178086    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 793.94  on 613  degrees of freedom
## Residual deviance: 564.46  on 605  degrees of freedom
## AIC: 582.46
## 
## Number of Fisher Scoring iterations: 5
#Based on logModel1, the significant values are as follows:
# ***: Glucose
# *: BMI 
# .: Insulin

#AIC: 284.17

Based on logModel1, the significant values are as follows: * **: Glucose : BMI .: Insulin

AIC: 284.17

logModel2 <- glm(formula = Outcome ~ Glucose + BMI + Insulin, 
                 family = binomial,
                 data = diab.train)
summary(logModel2)
## 
## Call:
## glm(formula = Outcome ~ Glucose + BMI + Insulin, family = binomial, 
##     data = diab.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2588  -0.7506  -0.4374   0.7348   2.3611  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.339728   0.733854 -11.364   <2e-16 ***
## Glucose      0.041933   0.004311   9.728   <2e-16 ***
## BMI          0.085804   0.016109   5.326    1e-07 ***
## Insulin     -0.002365   0.001381  -1.713   0.0868 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 793.94  on 613  degrees of freedom
## Residual deviance: 595.13  on 610  degrees of freedom
## AIC: 603.13
## 
## Number of Fisher Scoring iterations: 4
#Based on logModel2, the significant values are as follows:
# ***: Glucose, BMI
# .: Insulin

#AIC: 289.49
#Because this AIC is higher  than for logModel1 (289.49 > 284.17), we can assume that logModel1 is better than logModel2

Based on logModel2, the significant values are as follows: * **: Glucose, BMI .: Insulin

AIC: 289.49 Because this AIC is higher than for logModel1 (289.49 > 284.17), we can assume that logModel1 is better than logModel2.

logModel3 <- glm(formula = Outcome ~ Glucose + BMI, 
                 family = binomial,
                 data = diab.train)
summary(logModel3)
## 
## Call:
## glm(formula = Outcome ~ Glucose + BMI, family = binomial, data = diab.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2748  -0.7531  -0.4407   0.7271   2.2838  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.235682   0.723816 -11.378  < 2e-16 ***
## Glucose      0.039075   0.003874  10.086  < 2e-16 ***
## BMI          0.082170   0.015837   5.189 2.12e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 793.94  on 613  degrees of freedom
## Residual deviance: 597.95  on 611  degrees of freedom
## AIC: 603.95
## 
## Number of Fisher Scoring iterations: 4
#Based on logModel3, the significant values are as follows:
# ***: Glucose, BMI

#AIC: 590.81
#Because this AIC is higher  than for logModel1 and logModel2 (590.81 > 289.49 > 284.17), we can assume that this model is not as good as logModel1 or logModel2

Based on logModel3, the significant values are as follows: * ***: Glucose, BMI

AIC: 590.81 Because this AIC is higher than for logModel1 and logModel2 (590.81 > 289.49 > 284.17), we can assume that this model is not as good as logModel1 or logModel2.

Logistic Regression Prediction

Outcome.predictions <- predict(logModel1,
                               diab.test,
                               type = "response")

log.results <- cbind(Outcome.predictions,
                     diab.test$Outcome)

results.class <- ifelse(log.results > 0.5, 1,0) 
colnames(results.class) <- c("pred", "real")
results.class <- as.data.frame(results.class)

head(results.class)
##    pred real
## 6     0    1
## 7     0    1
## 9     1    1
## 14    0    1
## 17    0    1
## 22    0    1

Logistic Regression Confusion Matrix

table(diab.test$Outcome,
      results.class$pred)
##    
##      0  1
##   0 84 16
##   1 24 30

K-Nearest Neighbor

head(diab.train)
##   Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 1           6     148      72.00000      35.00000 154.3137 33.6
## 2           1      85      66.00000      29.00000 154.3137 26.6
## 3           8     183      64.00000      29.17647 154.3137 23.3
## 4           1      89      66.00000      23.00000  94.0000 28.1
## 5           0     137      40.00000      35.00000 168.0000 43.1
## 8          10     115      72.43027      29.17647 154.3137 35.3
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 8                    0.134  29       0
head(diab.test)
##    Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 6            5     116            74      29.17647 154.3137 25.6
## 7            3      78            50      32.00000  88.0000 31.0
## 9            2     197            70      45.00000 543.0000 30.5
## 14           1     189            60      23.00000 846.0000 30.1
## 17           0     118            84      47.00000 230.0000 45.8
## 22           8      99            84      29.17647 154.3137 35.4
##    DiabetesPedigreeFunction Age Outcome
## 6                     0.201  30       0
## 7                     0.248  26       1
## 9                     0.158  53       1
## 14                    0.398  59       1
## 17                    0.551  31       1
## 22                    0.388  50       0
str(diab.train)
## 'data.frame':    614 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 10 8 4 10 10 ...
##  $ Glucose                 : num  148 85 183 89 137 115 125 110 168 139 ...
##  $ BloodPressure           : num  72 66 64 66 40 ...
##  $ SkinThickness           : num  35 29 29.2 23 35 ...
##  $ Insulin                 : num  154 154 154 94 168 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 29 54 30 34 57 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
str(diab.test)
## 'data.frame':    154 obs. of  9 variables:
##  $ Pregnancies             : int  5 3 2 1 0 8 11 9 2 0 ...
##  $ Glucose                 : num  116 78 197 189 118 99 143 102 90 180 ...
##  $ BloodPressure           : num  74 50 70 60 84 84 94 76 68 66 ...
##  $ SkinThickness           : num  29.2 32 45 23 47 ...
##  $ Insulin                 : num  154 88 543 846 230 ...
##  $ BMI                     : num  25.6 31 30.5 30.1 45.8 35.4 36.6 32.9 38.2 42 ...
##  $ DiabetesPedigreeFunction: num  0.201 0.248 0.158 0.398 0.551 ...
##  $ Age                     : int  30 26 53 59 31 50 51 46 27 25 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 2 2 2 2 ...
sum(is.na(diab.train))
## [1] 0
sum(is.na(diab.test))
## [1] 0

KNN Fitting

#Fitting K-NN to the Training set and Predicting the Test set results
vec = c()
k_vec = c()

y_pred = knn(train = diab.train[ , -9],
             test = diab.test[ , -9],
             cl = diab.train[ , 9],
             k = 10)

head(y_pred)
## [1] 0 0 1 1 0 0
## Levels: 0 1
#Fitting K-NN to the Training set and Predicting the Test set results
vec = c()
k_vec = c()

for (k in 1:50){
y_pred = knn(train = diab.train[ , -9],
             test = diab.test[ , -9],
             cl = diab.train[ , 9],
             k = k)

error = mean(y_pred != diab.test$Outcome) #measure of accuracy
k_vec = c(k_vec, k)
vec = c(vec, error)
}

head(vec)
## [1] 0.3051948 0.3376623 0.3116883 0.3441558 0.3051948 0.2922078
df.error = data.frame(k_vec, vec)
head(df.error)
##   k_vec       vec
## 1     1 0.3051948
## 2     2 0.3376623
## 3     3 0.3116883
## 4     4 0.3441558
## 5     5 0.3051948
## 6     6 0.2922078

Elbow Method

ggplot(df.error,
       aes(k_vec, vec)) +
  geom_line()

KNN Confusion Matrix

y_pred = knn(train = diab.train[, -9],
             test = diab.test[, -9],
             cl = diab.train[, 9],
             k = 9)

# Making the Confusion Matrix
cm = table(diab.test[, 9], y_pred)
cm
##    y_pred
##      0  1
##   0 84 16
##   1 23 31
confusionMatrix(diab.test[ , 9],
                y_pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 84 16
##          1 23 31
##                                           
##                Accuracy : 0.7468          
##                  95% CI : (0.6705, 0.8133)
##     No Information Rate : 0.6948          
##     P-Value [Acc > NIR] : 0.09312         
##                                           
##                   Kappa : 0.4268          
##                                           
##  Mcnemar's Test P-Value : 0.33667         
##                                           
##             Sensitivity : 0.7850          
##             Specificity : 0.6596          
##          Pos Pred Value : 0.8400          
##          Neg Pred Value : 0.5741          
##              Prevalence : 0.6948          
##          Detection Rate : 0.5455          
##    Detection Prevalence : 0.6494          
##       Balanced Accuracy : 0.7223          
##                                           
##        'Positive' Class : 0               
## 

Kappa: 0.4268

Decision Tree & Random Forest

head(diab.train)
##   Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 1           6     148      72.00000      35.00000 154.3137 33.6
## 2           1      85      66.00000      29.00000 154.3137 26.6
## 3           8     183      64.00000      29.17647 154.3137 23.3
## 4           1      89      66.00000      23.00000  94.0000 28.1
## 5           0     137      40.00000      35.00000 168.0000 43.1
## 8          10     115      72.43027      29.17647 154.3137 35.3
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 8                    0.134  29       0
head(diab.test)
##    Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 6            5     116            74      29.17647 154.3137 25.6
## 7            3      78            50      32.00000  88.0000 31.0
## 9            2     197            70      45.00000 543.0000 30.5
## 14           1     189            60      23.00000 846.0000 30.1
## 17           0     118            84      47.00000 230.0000 45.8
## 22           8      99            84      29.17647 154.3137 35.4
##    DiabetesPedigreeFunction Age Outcome
## 6                     0.201  30       0
## 7                     0.248  26       1
## 9                     0.158  53       1
## 14                    0.398  59       1
## 17                    0.551  31       1
## 22                    0.388  50       0
str(diab.train)
## 'data.frame':    614 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 10 8 4 10 10 ...
##  $ Glucose                 : num  148 85 183 89 137 115 125 110 168 139 ...
##  $ BloodPressure           : num  72 66 64 66 40 ...
##  $ SkinThickness           : num  35 29 29.2 23 35 ...
##  $ Insulin                 : num  154 154 154 94 168 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 29 54 30 34 57 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 2 1 2 1 2 1 2 1 2 1 ...
str(diab.test)
## 'data.frame':    154 obs. of  9 variables:
##  $ Pregnancies             : int  5 3 2 1 0 8 11 9 2 0 ...
##  $ Glucose                 : num  116 78 197 189 118 99 143 102 90 180 ...
##  $ BloodPressure           : num  74 50 70 60 84 84 94 76 68 66 ...
##  $ SkinThickness           : num  29.2 32 45 23 47 ...
##  $ Insulin                 : num  154 88 543 846 230 ...
##  $ BMI                     : num  25.6 31 30.5 30.1 45.8 35.4 36.6 32.9 38.2 42 ...
##  $ DiabetesPedigreeFunction: num  0.201 0.248 0.158 0.398 0.551 ...
##  $ Age                     : int  30 26 53 59 31 50 51 46 27 25 ...
##  $ Outcome                 : Factor w/ 2 levels "0","1": 1 2 2 2 2 1 2 2 2 2 ...
sum(is.na(diab.train))
## [1] 0
sum(is.na(diab.test))
## [1] 0

Decision Tree

Tree

tree = rpart(formula = Outcome ~ .,
             method = "class",
             data = diab.train)

Tree Prediction

#Predicting the Test Set Results
y_pred = predict(tree,
                 newdata = diab.test)

head(y_pred)
##            0          1
## 6  0.9491525 0.05084746
## 7  0.9298246 0.07017544
## 9  0.1500000 0.85000000
## 14 0.1500000 0.85000000
## 17 0.1666667 0.83333333
## 22 0.9298246 0.07017544
y_pred_class = ifelse(y_pred[, "0"] >= 0.5,
                      0, 1)

Tree Confusion Matrix

# Making the Confusion Matrix
cm = table(diab.test$Outcome,
           y_pred_class)

cm
##    y_pred_class
##      0  1
##   0 88 12
##   1 24 30

Tree ROC

result.roc = roc(diab.test$Outcome,
                 y_pred[ ,"0"])
## Setting levels: control = 0, case = 1
## Setting direction: controls > cases
result.roc
## 
## Call:
## roc.default(response = diab.test$Outcome, predictor = y_pred[,     "0"])
## 
## Data: y_pred[, "0"] in 100 controls (diab.test$Outcome 0) > 54 cases (diab.test$Outcome 1).
## Area under the curve: 0.7947
plot(result.roc)

Decision Trees

plot(tree,
     uniform=TRUE,
     main="Diabetes Outcome")
text(tree,
     use.n=TRUE,
     all=TRUE)

prp(tree)

Random Forest

RF <- randomForest(formula = Outcome ~ .,
                   method='class',
                   data = diab.train)
RF
## 
## Call:
##  randomForest(formula = Outcome ~ ., data = diab.train, method = "class") 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.57%
## Confusion matrix:
##     0   1 class.error
## 0 340  60    0.150000
## 1  97 117    0.453271
importance(RF)
##                          MeanDecreaseGini
## Pregnancies                      23.45711
## Glucose                          71.64581
## BloodPressure                    24.75545
## SkinThickness                    19.84609
## Insulin                          22.99613
## BMI                              44.23679
## DiabetesPedigreeFunction         32.54496
## Age                              37.69331
#MeanDecreaseGini --> Importance (IncNodePurity)
diab.test1 <- diab.test[-9]
head(diab.test1)
##    Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 6            5     116            74      29.17647 154.3137 25.6
## 7            3      78            50      32.00000  88.0000 31.0
## 9            2     197            70      45.00000 543.0000 30.5
## 14           1     189            60      23.00000 846.0000 30.1
## 17           0     118            84      47.00000 230.0000 45.8
## 22           8      99            84      29.17647 154.3137 35.4
##    DiabetesPedigreeFunction Age
## 6                     0.201  30
## 7                     0.248  26
## 9                     0.158  53
## 14                    0.398  59
## 17                    0.551  31
## 22                    0.388  50

Random Forest Probabilities

y_prob = predict(RF,
                 newdata = diab.test1,
                 type="prob")

head(y_prob)
##        0     1
## 6  0.920 0.080
## 7  0.922 0.078
## 9  0.288 0.712
## 14 0.308 0.692
## 17 0.328 0.672
## 22 0.696 0.304

Random Forest Prediction

# Predicting the Test set results
y_pred = predict(RF,
                 newdata = diab.test)
confusionMatrix(diab.test[ , 9],
                y_pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 86 14
##          1 21 33
##                                           
##                Accuracy : 0.7727          
##                  95% CI : (0.6984, 0.8363)
##     No Information Rate : 0.6948          
##     P-Value [Acc > NIR] : 0.02001         
##                                           
##                   Kappa : 0.4856          
##                                           
##  Mcnemar's Test P-Value : 0.31049         
##                                           
##             Sensitivity : 0.8037          
##             Specificity : 0.7021          
##          Pos Pred Value : 0.8600          
##          Neg Pred Value : 0.6111          
##              Prevalence : 0.6948          
##          Detection Rate : 0.5584          
##    Detection Prevalence : 0.6494          
##       Balanced Accuracy : 0.7529          
##                                           
##        'Positive' Class : 0               
## 

Clustering

K-Means Clustering

diab.train2 <- diab.train[1:8]
head(diab.train2)
##   Pregnancies Glucose BloodPressure SkinThickness  Insulin  BMI
## 1           6     148      72.00000      35.00000 154.3137 33.6
## 2           1      85      66.00000      29.00000 154.3137 26.6
## 3           8     183      64.00000      29.17647 154.3137 23.3
## 4           1      89      66.00000      23.00000  94.0000 28.1
## 5           0     137      40.00000      35.00000 168.0000 43.1
## 8          10     115      72.43027      29.17647 154.3137 35.3
##   DiabetesPedigreeFunction Age
## 1                    0.627  50
## 2                    0.351  31
## 3                    0.672  32
## 4                    0.167  21
## 5                    2.288  33
## 8                    0.134  29
wcss2 = vector ()
  for (i in 1:10){ #range of number of clusters, max limit can be any reasonable number
    model = kmeans(diab.train2, i)
    wcss2[i] = sum(model$withinss)
  }

K-Means Find K

wcss = vector()
  for (i in 1:10){
    print(i)
    model = kmeans(diab.train2, i)
    wcss[i] = sum(model$withinss)
  }
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
plot(1:10,
     wcss,
     type = "b",
     main = paste("The Elbow Method"),
     xlab = "Number of Clusters",
     ylab = "WCSS")

kmeans = kmeans(x = diab.train2,
                centers = 2)
head(kmeans$cluster, 10)
##  1  2  3  4  5  8 10 11 12 13 
##  1  1  1  2  1  1  1  1  1  1
head(kmeans$centers, 10)
##   Pregnancies  Glucose BloodPressure SkinThickness   Insulin      BMI
## 1    4.057816 127.1758      73.76700      29.84872 178.91355 32.70388
## 2    2.795918 104.6039      68.18367      27.04082  76.16327 31.50011
##   DiabetesPedigreeFunction      Age
## 1                0.4726959 34.72805
## 2                0.4697143 27.63946
wcss = kmeans$withinss
wcss
## [1] 3117899.1  258757.9
sum(kmeans$withinss)
## [1] 3376657
head(kmeans$tot.withinss, 10)
## [1] 3376657

K-Means Clusters

y_kmeans = kmeans$cluster

clusplot(diab.train2,
         y_kmeans,
         lines = 0,
         shade = FALSE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = "Clusters of Patients")

Hierarchical Clustering

hc = hclust(d = dist(diab.train2,
                     method = "euclidean"),
            method = "ward.D") #Variance within each cluster

plot(hc,
     main = "Dendrogram",
     xlab = "Patients",
     ylab = "Euclidean Distances")

y_hc = cutree(hc, 2)

head(y_hc)
## 1 2 3 4 5 8 
## 1 1 1 1 1 1

Hierarchical Clustering Clusters

clusplot(diab.train2,
         y_hc,
         lines = 0,
         shade = FALSE,
         color = TRUE,
         labels= 2,
         plotchar = FALSE,
         span = TRUE,
         main = 'Clusters of Patients')

Hierarchical Clustering Find K

fviz_nbclust(diab.train2,
             hcut,
             method = "silhouette") +
  labs(subtitle = "Silhouette method")

fviz_nbclust(diab.train2,
             hcut,
             nstart = 25, 
             method = "gap_stat", 
             nboot = 50) +
  labs(subtitle = "Gap statistic method")